Introduction

This figure shows how/if different phenotypes of differentiated and undifferentiated cells of BE are maintained in cancer.

library("edgeR")
library("ggplot2")
library("pheatmap")
library("openxlsx")
library("RColorBrewer")
library("reshape2")
library("ggpubr")
library("knitr")
library("gridExtra")
# library('biomaRt')
files.dir <- "~/Dropbox/Postdoc/2019-12-29_BE2020/RNA-seq/"
files.out <- "~/Dropbox/Postdoc/2019-12-29_BE2020/Figures/"
anno_col <- list(Project.ID = c(Gastric = "#4DBBD5FF", Duodenum = "#3C5488FF", `Barrett's` = "#00A087FF", 
    `ICGC-ESAD` = "#D18118FF", Squamous = "darkred"), Tissue = c(NG = "#4DBBD5FF", 
    NE = "dark red", BE = "#00A087FF", ND = "#3C5488FF", SMG = "#B09C85FF"), RP.BarrettsAdjacent = c(ND = "grey", 
    yes = "#E0B400FF", no = "#00DCAFFF"))
breaksListZ = seq(-2, 2, by = 0.05)

Prepare data from bulk RNA-seq

EAC data

Let’s begin by loading all of the data

# EAC and normal data (tmp, linear scale)
bulk.data <- readRDS(paste0(files.dir, "/EAC-all/count_codingGenes_clean.rds"))

################# use this coded to download the annotation and store re in the genesID.rds
################# ensembl = useMart('ensembl') ensembl = useDataset('hsapiens_gene_ensembl', mart
################# = ensembl) genesID<-getBM(attributes = c('hgnc_symbol', 'ensembl_gene_id',
################# 'gene_biotype'), mart = ensembl) saveRDS(genesID, file = paste0(files.dir,
################# '/Figure_5C/genesID.rds'))

genesID <- readRDS(paste0(files.dir, "/genesID.rds"))

# remove duplicated entries (keep the first gene on the list in)
genesID2 <- genesID[genesID$hgnc_symbol %in% rownames(bulk.data), ]
genesID2 <- genesID2[!(duplicated(genesID2$hgnc_symbol)), ]
genesID2 <- genesID2[!(duplicated(genesID2$ensembl_gene_id)), ]

bulk.data <- bulk.data[genesID2$hgnc_symbol, ]
rownames(bulk.data) <- genesID2$ensembl_gene_id

metadata

Let’s all all of the metadata

# get metadata

# get information about the samples from EAC
rowID <- readRDS(paste0(files.dir, "/EAC-all/samplesAnnotation_clean.rds"))

# #keep only data for sample group and clinical info about Barrett's next to
# cancer
colnames(rowID)[6] <- "Project.ID"
rowID$Project.ID[rowID$Project.ID == "EAC"] <- "ICGC-ESAD"
rowID$Project.ID[rowID$Project.ID == "Barretts"] <- "Barrett's"
rowID$Sample.Type[rowID$Project.ID == "ICGC-ESAD"] <- "Primary Tumor"
rowID$Sample.Type[rowID$Project.ID != "ICGC-ESAD"] <- "Solid Tissue Normal"
rowID$RP.BarrettsAdjacent[is.na(rowID$RP.BarrettsAdjacent)] <- "ND"
rowID$RP.BarrettsAdjacent[rowID$RP.BarrettsAdjacent == "unknown"] <- "ND"


rowID$primary_diagnosis[rowID$Project.ID == "ICGC-ESAD"] <- "Adenocarcinoma"
rowID$primary_diagnosis[rowID$Project.ID != "ICGC-ESAD"] <- rowID$Project.ID[rowID$Project.ID != 
    "ICGC-ESAD"]
rowID$site_of_resection_or_biopsy <- "ND"
rowID$Case.ID <- row.names(rowID)
rowID$Sample.ID <- rowID$Case.ID

EdgeR DE analysis

Prep data

Let’s create edgeR object that will carry all of the information. I include all of the samples. From this object I will get normalized, batch corrected expression values

group <- factor(rowID$primary_diagnosis)
batch <- factor(rowID$batch)

y <- DGEList(bulk.data, group = group)


# #keep only genes with expression above 1 cpm in at least 3 samples
keep <- rowSums(edgeR::cpm(y) > 1) >= 3
y <- y[keep, , keep.lib.sizes = FALSE]

# get normalization factors
y <- calcNormFactors(y)

# Start performing diff expression - get sample desing model
design <- model.matrix(~batch + group)
# design <- model.matrix(~ 0 + group)
rownames(design) <- colnames(y)

# colnames(design) <- levels(group)

# Estimate disparsion
y <- estimateDisp(y, design, robust = TRUE)

plotBCV(y)

# Fit general linear model data
fit <- glmQLFit(y, design, robust = TRUE)

plotQLDisp(fit)

All samples

Let’s perform multidimension analysis of the data using building MDS function from edgeR and PC analysis.

# This can be used to get outliers
points <- c(0, 1, 2, 1, 15, 16, 17)
points <- rep(c(0, 1, 2), 3)
colors <- rep(c("blue", "darkgreen", "red", "black"), 3)


p <- limma::plotMDS(y, col = colors[group], pch = points[group], plot = FALSE)


p2 <- data.frame(p$x, p$y, .names = rowID$primary_diagnosis, .names2 = rowID$batch)


ggplot(data = p2) + geom_point(mapping = aes(x = p.x, y = p.y, colour = .names), 
    show.legend = TRUE) + theme_bw() + labs(x = paste0(p$axislabel, " 1"), y = paste0(p$axislabel, 
    " 2")) + theme(plot.title = element_text(hjust = 0.5)) + ggtitle("MDS analysis")

ggplot(data = p2) + geom_point(mapping = aes(x = p.x, y = p.y, colour = .names2), 
    show.legend = TRUE) + theme_bw() + labs(x = paste0(p$axislabel, " 1"), y = paste0(p$axislabel, 
    " 2")) + theme(plot.title = element_text(hjust = 0.5)) + ggtitle("MDS analysis")

All samples - HVG

For the analysis here I use 500 top HGV. All samples

# Create gene count matrix that will be used for expression display
geneMatrix <- edgeR::cpm(y, prior.count = 1, log = TRUE, normalized.lib.sizes = TRUE)
countVar <- apply(geneMatrix, 1, var)

highVar <- order(countVar, decreasing = TRUE)[1:500]
hmDat <- geneMatrix[highVar, ]

pca <- prcomp(t(hmDat), center = TRUE, scale. = TRUE)

loadings <- data.frame(pca$x, .names = rowID$primary_diagnosis, .names2 = rowID$batch)

ggplot(data = loadings) + geom_point(mapping = aes(x = PC1, y = PC2, colour = .names), 
    show.legend = TRUE) + theme_bw() + labs(x = paste0("PC1, ", round(summary(pca)$importance[2, 
    1] * 100, 2), "% variance"), y = paste0("PC2, ", round(summary(pca)$importance[2, 
    2] * 100, 2), "% variance")) + theme(plot.title = element_text(hjust = 0.5)) + 
    ggtitle("PCA - 500 HVGs")

ggplot(data = loadings) + geom_point(mapping = aes(x = PC1, y = PC2, colour = .names2), 
    show.legend = TRUE) + theme_bw() + labs(x = paste0("PC1, ", round(summary(pca)$importance[2, 
    1] * 100, 2), "% variance"), y = paste0("PC2, ", round(summary(pca)$importance[2, 
    2] * 100, 2), "% variance")) + theme(plot.title = element_text(hjust = 0.5)) + 
    ggtitle("PCA - 500 HVGs")

pheatmap(hmDat, annotation_col = rowID[, c(1, 4, 6)], clustering_method = "ward.D2", 
    main = "500 HVGs, all data", show_rownames = FALSE)

Eso cancers and normal samples - BE phenotype genes

For the analysis here I use BE phenotype cell markers. Eso cancers and normal cells.

# read marker genes for BE and GC Here I am using data from table S9 that has
# specific marker genes for these tissue types
files.dir2 <- "~/Dropbox/Postdoc/2019-12-29_BE2020/Figures/Supplemental_tables/"


# load workbook with marker genes
tmp <- loadWorkbook(paste0(files.dir2, "Table_S9.xlsx"))

# create a list with markers
markers <- list()

# read the content of marker files
for (i in names(tmp)) {
    markers[[i]] <- read.xlsx(tmp, sheet = i)
}



# read marker genes for endocrine cells
markers[["Endocrine"]] <- read.xlsx(loadWorkbook(paste0(files.dir2, "Table_S7.xlsx")), 
    sheet = "Endocrine_NEUROG3")

# Keep only markers that are over 1 logFC and FDR <0.1
markers[["Endocrine"]] <- markers[["Endocrine"]][apply(markers[["Endocrine"]][, 1:10], 
    1, min) > 1 & markers[["Endocrine"]]$FDR < 0.1, ]

# read marker genes for goblet cells
markers[["Goblet"]] <- read.xlsx(loadWorkbook(paste0(files.dir2, "Table_S7.xlsx")), 
    sheet = "Goblet")

# Keep only markers that are over 1 logFC and FDR <0.1
markers[["Goblet"]] <- markers[["Goblet"]][apply(markers[["Goblet"]][, 1:10], 1, 
    min) > 1 & markers[["Goblet"]]$FDR < 0.1, ]

Differential expression testing between cell types

Here, we perform DE between endrocrine cells of NG and BE

source("../Analysis/Functions/auxiliary.R")
## Loading required package: SingleCellExperiment
## Loading required package: SummarizedExperiment
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:limma':
## 
##     plotMA
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## Loading required package: BiocParallel
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum
## 
## Attaching package: 'SingleCellExperiment'
## The following object is masked from 'package:edgeR':
## 
##     cpm
## 
## Attaching package: 'scater'
## The following object is masked from 'package:limma':
## 
##     plotMDS
## 
## Attaching package: 'destiny'
## The following object is masked from 'package:SummarizedExperiment':
## 
##     distance
## The following object is masked from 'package:GenomicRanges':
## 
##     distance
## The following object is masked from 'package:IRanges':
## 
##     distance
# Read all scRNA-seq data
sce <- readRDS("~/Dropbox/Postdoc/2019-12-29_BE2020/All_corrected_sce_filtered.rds")

# Keep only data for NG and BE
cur_sce <- sce[,sce$include & sce$Tissue %in% c("NG", "BE")]# & sce$Patient != "Patient06"]

# Remove genes that are not expressed
cur_sce <- cur_sce[Matrix::rowSums(counts(cur_sce)) > 0,]

# DE within undiff cells tissue
sce_test <- cur_sce[,cur_sce$cell_type == "Endocrine"]

#Remove patients with fewer than 10 cells (removes 1 patients NG sample that had 1 cells)
sce_test$groups<-paste(sce_test$Patient, sce_test$Tissue, sce_test$cell_type, sep = "_")
sce_test <- sce_test[,sce_test$groups %in% names(table(sce_test$groups)[table(sce_test$groups)>=10])]



Endo_BE_vs_NG <- DE.edgeR(sce_test, conditions = sce_test$Tissue,
                          covariate = sce_test$Patient, lfc = 0.5,
                          FDR = 1)

[1] “Patient02 NG” “Patient13 NG” “Patient07 NG” “Patient03 NG” “Patient09 NG” [6] “Patient01 NG” “Patient10 NG” “Patient08 NG” “Patient07 BE” “Patient03 BE” [11] “Patient14 BE” “Patient09 BE”

Endo_BE_vs_NG <- rbind(Endo_BE_vs_NG$NG,
                       Endo_BE_vs_NG$BE[
                         seq(nrow(Endo_BE_vs_NG$BE), 1),])

Endo_BE_vs_NG$ID <- rownames(Endo_BE_vs_NG)

Endo_BE_vs_NG$specificity <- NA
Endo_BE_vs_NG$specificity[Endo_BE_vs_NG$logFC < 0 & Endo_BE_vs_NG$FDR <= 0.1] <- "NG"
Endo_BE_vs_NG$specificity[Endo_BE_vs_NG$logFC > 0 & Endo_BE_vs_NG$FDR <= 0.1] <- "BE"


# Identify BE specific genes
# Endocrine
# BE specific
cur_1 <- Endo_BE_vs_NG[Endo_BE_vs_NG$specificity == "BE" &
                         !is.na(Endo_BE_vs_NG$specificity),]
cur_2 <- markers[["Endocrine"]]
rownames(cur_2) <- cur_2$ID
shared.genes <- intersect(rownames(cur_1), rownames(cur_2))
BE_endo <- cbind(cur_1[shared.genes,], cur_2[shared.genes,])
BE_endo <- BE_endo[,c(1,5,9:32,8)]

colnames(BE_endo)[1:24] <- c(paste(colnames(BE_endo)[1:2], "BE_vs_NG", sep = "."),
                             paste("Endo_vs",colnames(BE_endo)[3:22],  sep = "_"),
                             colnames(BE_endo)[23],
                             "Combined_vs_other_FDR")

# Order table
BE_endo <- BE_endo[order(rowMeans(BE_endo[,c(2,24)]), decreasing = FALSE),]


markers[["Endocrine2"]] <- BE_endo

write.xlsx(BE_endo, file = "~/Dropbox/Postdoc/2019-12-29_BE2020/Figures/Supplemental_tables/Table_S10.xlsx")

Goblet and differentiated in BE only genes

selection <- c("Barrett's", "Squamous", "Gastric", "Duodenum", "ICGC-ESAD")
hmDat <- geneMatrix[, rownames(rowID)[rowID$Project.ID %in% selection]]

#################################################################################### get marker genes for goblet cells and BE differentiated cells
iiii <- c("Goblet", "BE_diff_specific")
# iiii<-c( 'BE_diff_specific')
genes.list <- vector()
for (i in iiii) {
    genes.list <- unique(c(genes.list, markers[[i]]$ID))
}

# plot z-score data
pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    6), drop = FALSE], clustering_method = "ward.D2", main = "BE Diff markers CAM BE, GC, D2 and EAC - z-score", 
    show_rownames = FALSE, cluster_rows = T, scale = "row", breaks = breaksListZ, 
    color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(length(breaksListZ)), 
    show_colnames = FALSE, annotation_colors = anno_col)  #,annotation_row = IDs)

# plot raw expression data
pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    6), drop = FALSE], clustering_method = "ward.D2", main = "BE Diff markers CAM BE, GC, D2 and EAC", 
    show_rownames = FALSE, cluster_rows = TRUE, cutree_cols = 1, show_colnames = FALSE, 
    annotation_colors = anno_col)  #,annotation_row = IDs)#, scale='row', breaks = breaksListZ,color = colorRampPalette(rev(brewer.pal(n = 7, name = 'RdBu')))(length(breaksListZ)))

########################################### Scaled to BE samples only########

# get the data for all important genes
hmDat2 <- hmDat[rownames(hmDat) %in% genes.list, ]

# scale the data around the mean expression in BE samples without any scaling
# factor. I use scale fuction to do it with center and scale arguments. Then
# calculate the median expression of scaled expression in each of the samples.

hmDat2 <- apply(apply(hmDat2, 1, function(x) scale(x, center = mean(x[rownames(rowID)[rowID$primary_diagnosis == 
    "Barrett's"]]), scale = FALSE)), 1, median)


# fix the names
names(hmDat2) <- colnames(hmDat[rownames(hmDat) %in% genes.list, ])

# merge the median expression per sample with other clinical information
hmDat2 <- merge(rowID, hmDat2, by = 0, all.y = TRUE)

# remove cancer samples without Barrett's status
hmDat2 <- hmDat2[!(hmDat2$RP.BarrettsAdjacent == "ND" & hmDat2$Project.ID == "ICGC-ESAD"), 
    ]
hmDat2$is.normal <- ifelse(hmDat2$Project.ID %in% c("Squamous", "Gastric", "Duodenum"), 
    "Normal", "NotNormal")
hmDat2$final <- paste0(hmDat2$Project.ID, hmDat2$is.normal, hmDat2$RP.BarrettsAdjacent)
hmDat2$colour <- paste0(hmDat2$is.normal, hmDat2$RP.BarrettsAdjacent)

# order the samples for the boxplots
hmDat2$Project.ID <- factor(hmDat2$Project.ID, levels = c("Squamous", "Gastric", 
    "Duodenum", "Barrett's", "ICGC-ESAD"))
hmDat2$RP.BarrettsAdjacent <- factor(hmDat2$RP.BarrettsAdjacent, levels = c("yes", 
    "no", "ND"))
hmDat2$RP.TumourGradingDifferentiationStatus <- factor(hmDat2$RP.TumourGradingDifferentiationStatus, 
    levels = c("poor", "moderate_to_poor", "moderate", "moderate_to_well", "well", 
        "unknown", NA))
hmDat2$final <- factor(hmDat2$final, levels = c("SquamousNormalND", "GastricNormalND", 
    "DuodenumNormalND", "Barrett'sNotNormalND", "ICGC-ESADNotNormalyes", "ICGC-ESADNotNormalno", 
    "ICGC-ESADNotNormalND"))

# plot all data without subgroups
p_all <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle("All") + geom_point(position = position_jitterdodge(jitter.width = 0.5, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + 
    theme_bw() + ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))
p_all

# remove duodenum from the figure
hmDat3 <- hmDat2[hmDat2$primary_diagnosis != "Duodenum", ]
# plot all data without subgroups
p_all <- ggplot(hmDat3, aes(x = Project.ID, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle(paste0("All without Duodenum ", 
    length(genes.list))) + geom_point(position = position_jitterdodge(jitter.width = 0.5, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + 
    theme_bw() + ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(3, 
    4)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))
p_all

p_diff <- p_all + theme(legend.position = "none")

p_all <- ggplot(hmDat3, aes(x = final, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle(paste0("All without Duodenum ", 
    length(genes.list))) + geom_point(position = position_jitterdodge(jitter.width = 0.5, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + 
    theme_bw() + ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5), c(3, 4), c(3, 5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 
    0.05, 1), symbols = c("****", "***", "**", "*", "ns")))
p_all

p_diff_BE <- p_all + theme(legend.position = "none")



# plot all data with cancers split according to Barrett's status
p <- ggplot(hmDat2, aes(x = final, y = y, fill = colour)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.8) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle("Barrett's next to") + 
    geom_point(position = position_jitterdodge(jitter.width = 0.4, dodge.width = 1, 
        seed = 50014), pch = 21, aes(fill = colour), size = 2.5) + theme_bw() + ylab("Normalized expression") + 
    xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p <- p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(5, 
    6), c(4, 5), c(4, 6)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 
    0.05, 1), symbols = c("****", "***", "**", "*", "ns")))
p

ggsave(paste0(files.out, "./Figure_5/Figure_5C_Diff.pdf"), p_all, width = 12, height = 7, 
    useDingbats = FALSE)


kable(table(hmDat2$RP.BarrettsAdjacent[hmDat2$Project.ID == "ICGC-ESAD"]))
Var1 Freq
yes 107
no 109
ND 0
# plot all data with cancers split according to Siewer Classification
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.SiewertClassification)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Siewert type") + geom_point(position = position_jitterdodge(jitter.width = 0.4, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.SiewertClassification), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

# plot all data with cancers split according to Tumour Differentiation Status
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.TumourGradingDifferentiationStatus)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Differentiation") + geom_point(position = position_jitterdodge(jitter.width = 0.05, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.TumourGradingDifferentiationStatus), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

# plot all data with cancers split according to Tumour stage
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.TStage.PrimaryTumour)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Stage") + geom_point(position = position_jitterdodge(jitter.width = 0.05, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.TStage.PrimaryTumour), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    6), drop = FALSE], clustering_method = "ward.D2", main = "BE Diff markers CAM BE, GC, D2 and EAC - z-score", 
    show_rownames = FALSE, cluster_rows = T, scale = "row", breaks = breaksListZ, 
    color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(length(breaksListZ)), 
    show_colnames = FALSE, annotation_colors = anno_col, file = paste0(files.out, 
        "./Figure_5/Figure_5C_Diff_heatmap.pdf"), width = 15)

Endocrine cells from BE only

# get marker genes for Endocrine cells
iiii <- c("Endocrine2")
genes.list <- vector()
for (i in iiii) {
    genes.list <- unique(c(genes.list, markers[[i]]$ID))
}

# plot z-score data
pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    6), drop = FALSE], clustering_method = "ward.D2", main = "BE Diff markers CAM BE, GC, D2 and EAC - z-score", 
    show_rownames = FALSE, cluster_rows = T, scale = "row", breaks = breaksListZ, 
    color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(length(breaksListZ)), 
    show_colnames = FALSE, annotation_colors = anno_col)  #,annotation_row = IDs)

# plot raw expression data
pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    6), drop = FALSE], clustering_method = "ward.D2", main = "BE Diff markers CAM BE, GC, D2 and EAC", 
    show_rownames = FALSE, cluster_rows = TRUE, cutree_cols = 1, show_colnames = FALSE, 
    annotation_colors = anno_col)  #,annotation_row = IDs)#, scale='row', breaks = breaksListZ,color = colorRampPalette(rev(brewer.pal(n = 7, name = 'RdBu')))(length(breaksListZ)))

########################################### Scaled to BE samples only########

# get the data for all important genes
hmDat2 <- hmDat[rownames(hmDat) %in% genes.list, ]

# scale the data around the mean expression in BE samples without any scaling
# factor. I use scale fuction to do it with center and scale arguments. Then
# calculate the median expression of scaled expression in each of the samples.

hmDat2 <- apply(apply(hmDat2, 1, function(x) scale(x, center = mean(x[rownames(rowID)[rowID$primary_diagnosis == 
    "Barrett's"]]), scale = FALSE)), 1, median)


# fix the names
names(hmDat2) <- colnames(hmDat[rownames(hmDat) %in% genes.list, ])

# merge the median expression per sample with other clinical information
hmDat2 <- merge(rowID, hmDat2, by = 0, all.y = TRUE)

# remove cancer samples without Barrett's status
hmDat2 <- hmDat2[!(hmDat2$RP.BarrettsAdjacent == "ND" & hmDat2$Project.ID == "ICGC-ESAD"), 
    ]
hmDat2$is.normal <- ifelse(hmDat2$Project.ID %in% c("Squamous", "Gastric", "Duodenum"), 
    "Normal", "NotNormal")
hmDat2$final <- paste0(hmDat2$Project.ID, hmDat2$is.normal, hmDat2$RP.BarrettsAdjacent)
hmDat2$colour <- paste0(hmDat2$is.normal, hmDat2$RP.BarrettsAdjacent)

# order the samples for the boxplots
hmDat2$Project.ID <- factor(hmDat2$Project.ID, levels = c("Squamous", "Gastric", 
    "Duodenum", "Barrett's", "ICGC-ESAD"))
hmDat2$RP.BarrettsAdjacent <- factor(hmDat2$RP.BarrettsAdjacent, levels = c("yes", 
    "no", "ND"))
hmDat2$RP.TumourGradingDifferentiationStatus <- factor(hmDat2$RP.TumourGradingDifferentiationStatus, 
    levels = c("poor", "moderate_to_poor", "moderate", "moderate_to_well", "well", 
        "unknown", NA))
hmDat2$final <- factor(hmDat2$final, levels = c("SquamousNormalND", "GastricNormalND", 
    "DuodenumNormalND", "Barrett'sNotNormalND", "ICGC-ESADNotNormalyes", "ICGC-ESADNotNormalno", 
    "ICGC-ESADNotNormalND"))


# plot all data without subgroups
p_all <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle("All") + geom_point(position = position_jitterdodge(jitter.width = 0.5, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + 
    theme_bw() + ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))
p_all

# remove duodenum from the figure
hmDat3 <- hmDat2[hmDat2$primary_diagnosis != "Duodenum", ]
# plot all data without subgroups
p_all <- ggplot(hmDat3, aes(x = Project.ID, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle(paste0("All without Duodenum ", 
    length(genes.list))) + geom_point(position = position_jitterdodge(jitter.width = 0.5, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + 
    theme_bw() + ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(3, 
    4)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))
p_all

p_all <- ggplot(hmDat3, aes(x = final, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle(paste0("All without Duodenum ", 
    length(genes.list))) + geom_point(position = position_jitterdodge(jitter.width = 0.5, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + 
    theme_bw() + ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5), c(3, 4), c(3, 5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 
    0.05, 1), symbols = c("****", "***", "**", "*", "ns")))
p_all

p_endo_BE <- p_all + theme(legend.position = "none")

# plot all data with cancers split according to Barrett's status
p <- ggplot(hmDat2, aes(x = final, y = y, fill = colour)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.8) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle("Barrett's next to") + 
    geom_point(position = position_jitterdodge(jitter.width = 0.4, dodge.width = 1, 
        seed = 50014), pch = 21, aes(fill = colour), size = 2.5) + theme_bw() + ylab("Normalized expression") + 
    xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p <- p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(5, 
    6), c(4, 5), c(4, 6)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 
    0.05, 1), symbols = c("****", "***", "**", "*", "ns")))
p

ggsave(paste0(files.out, "./Figure_5/Figure_5C_Endocrine.pdf"), p_all, width = 12, 
    height = 7, useDingbats = FALSE)


kable(table(hmDat2$RP.BarrettsAdjacent[hmDat2$Project.ID == "ICGC-ESAD"]))
Var1 Freq
yes 107
no 109
ND 0
# plot all data with cancers split according to Siewer Classification
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.SiewertClassification)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Siewert type") + geom_point(position = position_jitterdodge(jitter.width = 0.4, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.SiewertClassification), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

# plot all data with cancers split according to Tumour Differentiation Status
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.TumourGradingDifferentiationStatus)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Differentiation") + geom_point(position = position_jitterdodge(jitter.width = 0.05, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.TumourGradingDifferentiationStatus), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

# plot all data with cancers split according to Tumour stage
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.TStage.PrimaryTumour)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Stage") + geom_point(position = position_jitterdodge(jitter.width = 0.05, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.TStage.PrimaryTumour), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    6), drop = FALSE], clustering_method = "ward.D2", main = "BE Endocrine markers CAM BE, NG, ND and EAC - z-score", 
    show_rownames = FALSE, cluster_rows = T, scale = "row", breaks = breaksListZ, 
    color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(length(breaksListZ)), 
    show_colnames = FALSE, annotation_colors = anno_col, file = paste0(files.out, 
        "/Figure_5/Figure_5C_Endocrine_heatmap.pdf"), width = 15)

Endocrine and undifferentiated cells from BE only

# get marker genes for Endocrine cells
iiii <- c("Endocrine2", "BE_undiff_specific")
genes.list <- vector()
for (i in iiii) {
    genes.list <- unique(c(genes.list, markers[[i]]$ID))
}

# plot z-score data
pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    6), drop = FALSE], clustering_method = "ward.D2", main = "BE Diff markers CAM BE, GC, D2 and EAC - z-score", 
    show_rownames = FALSE, cluster_rows = T, scale = "row", breaks = breaksListZ, 
    color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(length(breaksListZ)), 
    show_colnames = FALSE, annotation_colors = anno_col)  #,annotation_row = IDs)

# plot raw expression data
pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    6), drop = FALSE], clustering_method = "ward.D2", main = "BE Diff markers CAM BE, GC, D2 and EAC", 
    show_rownames = FALSE, cluster_rows = TRUE, cutree_cols = 1, show_colnames = FALSE, 
    annotation_colors = anno_col)  #,annotation_row = IDs)#, scale='row', breaks = breaksListZ,color = colorRampPalette(rev(brewer.pal(n = 7, name = 'RdBu')))(length(breaksListZ)))

########################################### Scaled to BE samples only########

# get the data for all important genes
hmDat2 <- hmDat[rownames(hmDat) %in% genes.list, ]

# scale the data around the mean expression in BE samples without any scaling
# factor. I use scale fuction to do it with center and scale arguments. Then
# calculate the median expression of scaled expression in each of the samples.

hmDat2 <- apply(apply(hmDat2, 1, function(x) scale(x, center = mean(x[rownames(rowID)[rowID$primary_diagnosis == 
    "Barrett's"]]), scale = FALSE)), 1, median)


# fix the names
names(hmDat2) <- colnames(hmDat[rownames(hmDat) %in% genes.list, ])

# merge the median expression per sample with other clinical information
hmDat2 <- merge(rowID, hmDat2, by = 0, all.y = TRUE)

# remove cancer samples without Barrett's status
hmDat2 <- hmDat2[!(hmDat2$RP.BarrettsAdjacent == "ND" & hmDat2$Project.ID == "ICGC-ESAD"), 
    ]
hmDat2$is.normal <- ifelse(hmDat2$Project.ID %in% c("Squamous", "Gastric", "Duodenum"), 
    "Normal", "NotNormal")
hmDat2$final <- paste0(hmDat2$Project.ID, hmDat2$is.normal, hmDat2$RP.BarrettsAdjacent)
hmDat2$colour <- paste0(hmDat2$is.normal, hmDat2$RP.BarrettsAdjacent)

# order the samples for the boxplots
hmDat2$Project.ID <- factor(hmDat2$Project.ID, levels = c("Squamous", "Gastric", 
    "Duodenum", "Barrett's", "ICGC-ESAD"))
hmDat2$RP.BarrettsAdjacent <- factor(hmDat2$RP.BarrettsAdjacent, levels = c("yes", 
    "no", "ND"))
hmDat2$RP.TumourGradingDifferentiationStatus <- factor(hmDat2$RP.TumourGradingDifferentiationStatus, 
    levels = c("poor", "moderate_to_poor", "moderate", "moderate_to_well", "well", 
        "unknown", NA))
hmDat2$final <- factor(hmDat2$final, levels = c("SquamousNormalND", "GastricNormalND", 
    "DuodenumNormalND", "Barrett'sNotNormalND", "ICGC-ESADNotNormalyes", "ICGC-ESADNotNormalno", 
    "ICGC-ESADNotNormalND"))


# plot all data without subgroups
p_all <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle("All") + geom_point(position = position_jitterdodge(jitter.width = 0.5, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + 
    theme_bw() + ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))
p_all

# remove duodenum from the figure
hmDat3 <- hmDat2[hmDat2$primary_diagnosis != "Duodenum", ]
# plot all data without subgroups
p_all <- ggplot(hmDat3, aes(x = Project.ID, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle(paste0("All without Duodenum", 
    length(genes.list))) + geom_point(position = position_jitterdodge(jitter.width = 0.5, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + 
    theme_bw() + ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(3, 
    4)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))
p_all

p_undiff <- p_all + theme(legend.position = "none")


p_all <- ggplot(hmDat3, aes(x = final, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle(paste0("All without Duodenum ", 
    length(genes.list))) + geom_point(position = position_jitterdodge(jitter.width = 0.5, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + 
    theme_bw() + ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5), c(3, 4), c(3, 5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 
    0.05, 1), symbols = c("****", "***", "**", "*", "ns")))
p_all

p_undiff_BE <- p_all + theme(legend.position = "none")

# plot all data with cancers split according to Barrett's status
p <- ggplot(hmDat2, aes(x = final, y = y, fill = colour)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.8) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle("Barrett's next to") + 
    geom_point(position = position_jitterdodge(jitter.width = 0.4, dodge.width = 1, 
        seed = 50014), pch = 21, aes(fill = colour), size = 2.5) + theme_bw() + ylab("Normalized expression") + 
    xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p <- p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(5, 
    6), c(4, 5), c(4, 6)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 
    0.05, 1), symbols = c("****", "***", "**", "*", "ns")))
p

ggsave(paste0(files.out, "./Figure_5/Figure_5C_Undiff_Endocrine.pdf"), p_all, width = 12, 
    height = 7, useDingbats = FALSE)



all.figures <- grid.arrange(p_undiff, p_diff, nrow = 1)

ggsave(paste0(files.out, "./Figure_5/Figure_5C_Combined.pdf"), all.figures, width = 6, 
    height = 3, useDingbats = FALSE)


all.figures <- grid.arrange(p_undiff_BE, p_diff_BE, nrow = 1)

ggsave(paste0(files.out, "./Figure_5/Figure_5C_Combined_BE.pdf"), all.figures, width = 9, 
    height = 5, useDingbats = FALSE)

kable(table(hmDat2$RP.BarrettsAdjacent[hmDat2$Project.ID == "ICGC-ESAD"]))
Var1 Freq
yes 107
no 109
ND 0
# plot all data with cancers split according to Siewer Classification
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.SiewertClassification)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Siewert type") + geom_point(position = position_jitterdodge(jitter.width = 0.4, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.SiewertClassification), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

# plot all data with cancers split according to Tumour Differentiation Status
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.TumourGradingDifferentiationStatus)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Differentiation") + geom_point(position = position_jitterdodge(jitter.width = 0.05, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.TumourGradingDifferentiationStatus), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

# plot all data with cancers split according to Tumour stage
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.TStage.PrimaryTumour)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Stage") + geom_point(position = position_jitterdodge(jitter.width = 0.05, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.TStage.PrimaryTumour), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    6), drop = FALSE], clustering_method = "ward.D2", main = "BE Endocrine markers CAM BE, NG, ND and EAC - z-score", 
    show_rownames = FALSE, cluster_rows = T, scale = "row", breaks = breaksListZ, 
    color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(length(breaksListZ)), 
    show_colnames = FALSE, annotation_colors = anno_col, file = paste0(files.out, 
        "/Figure_5/Figure_5C_Undiff_Endocrine_heatmap.pdf"), width = 15)

Endocrine cells from BE - All

# get marker genes for Endocrine cells
iiii <- c("Endocrine")
genes.list <- vector()
for (i in iiii) {
    genes.list <- unique(c(genes.list, markers[[i]]$ID))
}

# plot z-score data
pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    6), drop = FALSE], clustering_method = "ward.D2", main = "BE Diff markers CAM BE, GC, D2 and EAC - z-score", 
    show_rownames = FALSE, cluster_rows = T, scale = "row", breaks = breaksListZ, 
    color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(length(breaksListZ)), 
    show_colnames = FALSE, annotation_colors = anno_col)  #,annotation_row = IDs)

# plot raw expression data
pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    6), drop = FALSE], clustering_method = "ward.D2", main = "BE Diff markers CAM BE, GC, D2 and EAC", 
    show_rownames = FALSE, cluster_rows = TRUE, cutree_cols = 1, show_colnames = FALSE, 
    annotation_colors = anno_col)  #,annotation_row = IDs)#, scale='row', breaks = breaksListZ,color = colorRampPalette(rev(brewer.pal(n = 7, name = 'RdBu')))(length(breaksListZ)))

########################################### Scaled to BE samples only########

# get the data for all important genes
hmDat2 <- hmDat[rownames(hmDat) %in% genes.list, ]

# scale the data around the mean expression in BE samples without any scaling
# factor. I use scale fuction to do it with center and scale arguments. Then
# calculate the median expression of scaled expression in each of the samples.

hmDat2 <- apply(apply(hmDat2, 1, function(x) scale(x, center = mean(x[rownames(rowID)[rowID$primary_diagnosis == 
    "Barrett's"]]), scale = FALSE)), 1, median)


# fix the names
names(hmDat2) <- colnames(hmDat[rownames(hmDat) %in% genes.list, ])

# merge the median expression per sample with other clinical information
hmDat2 <- merge(rowID, hmDat2, by = 0, all.y = TRUE)

# remove cancer samples without Barrett's status
hmDat2 <- hmDat2[!(hmDat2$RP.BarrettsAdjacent == "ND" & hmDat2$Project.ID == "ICGC-ESAD"), 
    ]
hmDat2$is.normal <- ifelse(hmDat2$Project.ID %in% c("Squamous", "Gastric", "Duodenum"), 
    "Normal", "NotNormal")
hmDat2$final <- paste0(hmDat2$Project.ID, hmDat2$is.normal, hmDat2$RP.BarrettsAdjacent)
hmDat2$colour <- paste0(hmDat2$is.normal, hmDat2$RP.BarrettsAdjacent)

# order the samples for the boxplots
hmDat2$Project.ID <- factor(hmDat2$Project.ID, levels = c("Squamous", "Gastric", 
    "Duodenum", "Barrett's", "ICGC-ESAD"))
hmDat2$RP.BarrettsAdjacent <- factor(hmDat2$RP.BarrettsAdjacent, levels = c("yes", 
    "no", "ND"))
hmDat2$RP.TumourGradingDifferentiationStatus <- factor(hmDat2$RP.TumourGradingDifferentiationStatus, 
    levels = c("poor", "moderate_to_poor", "moderate", "moderate_to_well", "well", 
        "unknown", NA))
hmDat2$final <- factor(hmDat2$final, levels = c("SquamousNormalND", "GastricNormalND", 
    "DuodenumNormalND", "Barrett'sNotNormalND", "ICGC-ESADNotNormalyes", "ICGC-ESADNotNormalno", 
    "ICGC-ESADNotNormalND"))


# plot all data without subgroups
p_all <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle("All") + geom_point(position = position_jitterdodge(jitter.width = 0.5, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + 
    theme_bw() + ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))
p_all

# remove duodenum from the figure
hmDat3 <- hmDat2[hmDat2$primary_diagnosis != "Duodenum", ]
# plot all data without subgroups
p_all <- ggplot(hmDat3, aes(x = Project.ID, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle("All without Duodenum") + 
    geom_point(position = position_jitterdodge(jitter.width = 0.5, dodge.width = 1, 
        seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + theme_bw() + 
    ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(3, 
    4)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))
p_all

# plot all data with cancers split according to Barrett's status
p <- ggplot(hmDat2, aes(x = final, y = y, fill = colour)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.8) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle("Barrett's next to") + 
    geom_point(position = position_jitterdodge(jitter.width = 0.4, dodge.width = 1, 
        seed = 50014), pch = 21, aes(fill = colour), size = 2.5) + theme_bw() + ylab("Normalized expression") + 
    xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p <- p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(5, 
    6), c(4, 5), c(4, 6)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 
    0.05, 1), symbols = c("****", "***", "**", "*", "ns")))
p

ggsave(paste0(files.out, "./Figure_5/Figure_5C_Endocrine_all.pdf"), p_all, width = 12, 
    height = 7, useDingbats = FALSE)


kable(table(hmDat2$RP.BarrettsAdjacent[hmDat2$Project.ID == "ICGC-ESAD"]))
Var1 Freq
yes 107
no 109
ND 0
# plot all data with cancers split according to Siewer Classification
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.SiewertClassification)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Siewert type") + geom_point(position = position_jitterdodge(jitter.width = 0.4, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.SiewertClassification), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

# plot all data with cancers split according to Tumour Differentiation Status
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.TumourGradingDifferentiationStatus)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Differentiation") + geom_point(position = position_jitterdodge(jitter.width = 0.05, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.TumourGradingDifferentiationStatus), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

# plot all data with cancers split according to Tumour stage
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.TStage.PrimaryTumour)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Stage") + geom_point(position = position_jitterdodge(jitter.width = 0.05, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.TStage.PrimaryTumour), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    6), drop = FALSE], clustering_method = "ward.D2", main = "BE Endocrine markers CAM BE, NG, ND and EAC - z-score", 
    show_rownames = FALSE, cluster_rows = T, scale = "row", breaks = breaksListZ, 
    color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(length(breaksListZ)), 
    show_colnames = FALSE, annotation_colors = anno_col, file = paste0(files.out, 
        "/Figure_5/Figure_5C_Endocrine_heatmap_all.pdf"), width = 15)

Undifferentiated in BE only genes

iiii <- c("BE_undiff_specific")
genes.list <- vector()
for (i in iiii) {
    genes.list <- unique(c(genes.list, markers[[i]]$ID))
}

# plot z-score data
pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    3, 4, 6), drop = FALSE], clustering_method = "ward.D2", main = "BE Undifferentiated markers CAM BE, GC, D2 and EAC - z-score", 
    show_rownames = FALSE, cluster_rows = T, scale = "row", breaks = breaksListZ, 
    color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(length(breaksListZ)), 
    show_colnames = FALSE, annotation_colors = anno_col)

# plot raw expression data
pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    6), drop = FALSE], clustering_method = "ward.D2", main = "BE Undifferentiated markers CAM BE, GC, D2 and EAC", 
    show_rownames = FALSE, cluster_rows = TRUE, show_colnames = FALSE, annotation_colors = anno_col)

# , scale='row', breaks = breaksListZ,color = colorRampPalette(rev(brewer.pal(n =
# 7, name = 'RdBu')))(length(breaksListZ)))


########################################### Scaled to BE samples only########

# get the data for all important genes
hmDat2 <- hmDat[rownames(hmDat) %in% genes.list, ]

# scale the data around the mean expression in BE samples without any scaling
# factor. I use scale fuction to do it with center and scale arguments. Then
# calculate the median expression of scaled expression in each of the samples.

hmDat2 <- apply(apply(hmDat2, 1, function(x) scale(x, center = mean(x[rownames(rowID)[rowID$primary_diagnosis == 
    "Barrett's"]]), scale = FALSE)), 1, median)


# fix the names
names(hmDat2) <- colnames(hmDat[rownames(hmDat) %in% genes.list, ])

# merge the median expression per sample with other clinical information
hmDat2 <- merge(rowID, hmDat2, by = 0, all.y = TRUE)

# remove cancer samples without Barrett's status
hmDat2 <- hmDat2[!(hmDat2$RP.BarrettsAdjacent == "ND" & hmDat2$Project.ID == "ICGC-ESAD"), 
    ]
hmDat2$is.normal <- ifelse(hmDat2$Project.ID %in% c("Squamous", "Gastric", "Duodenum"), 
    "Normal", "NotNormal")
hmDat2$final <- paste0(hmDat2$Project.ID, hmDat2$is.normal, hmDat2$RP.BarrettsAdjacent)
hmDat2$colour <- paste0(hmDat2$is.normal, hmDat2$RP.BarrettsAdjacent)

# order the samples for the boxplots
hmDat2$Project.ID <- factor(hmDat2$Project.ID, levels = c("Squamous", "Gastric", 
    "Duodenum", "Barrett's", "ICGC-ESAD"))
hmDat2$RP.BarrettsAdjacent <- factor(hmDat2$RP.BarrettsAdjacent, levels = c("yes", 
    "no", "ND"))
hmDat2$RP.TumourGradingDifferentiationStatus <- factor(hmDat2$RP.TumourGradingDifferentiationStatus, 
    levels = c("poor", "moderate_to_poor", "moderate", "moderate_to_well", "well", 
        "unknown", NA))
hmDat2$final <- factor(hmDat2$final, levels = c("SquamousNormalND", "GastricNormalND", 
    "DuodenumNormalND", "Barrett'sNotNormalND", "ICGC-ESADNotNormalyes", "ICGC-ESADNotNormalno", 
    "ICGC-ESADNotNormalND"))


# plot all data without subgroups
p_all <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle("All") + geom_point(position = position_jitterdodge(jitter.width = 0.5, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + 
    theme_bw() + ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))
p_all

# remove duodenum from the figure
hmDat3 <- hmDat2[hmDat2$primary_diagnosis != "Duodenum", ]
# plot all data without subgroups
p_all <- ggplot(hmDat3, aes(x = Project.ID, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle("All without Duodenum") + 
    geom_point(position = position_jitterdodge(jitter.width = 0.5, dodge.width = 1, 
        seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + theme_bw() + 
    ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(3, 
    4)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))
p_all

# plot all data with cancers split according to Barrett's status
p <- ggplot(hmDat2, aes(x = final, y = y, fill = colour)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.8) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle("Barrett's next to") + 
    geom_point(position = position_jitterdodge(jitter.width = 0.4, dodge.width = 1, 
        seed = 50014), pch = 21, aes(fill = colour), size = 2.5) + theme_bw() + ylab("Normalized expression") + 
    xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p <- p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(5, 
    6), c(4, 5), c(4, 6)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 
    0.05, 1), symbols = c("****", "***", "**", "*", "ns")))
p

ggsave(paste0(files.out, "./Figure_5/Figure_5D_Undiff.pdf"), p_all, width = 12, height = 7, 
    useDingbats = FALSE)


# wilcox.test(hmDat2$value[hmDat2$RP.BarrettsAdjacent == 'no'],
# hmDat2$value[hmDat2$RP.BarrettsAdjacent == 'yes'])


# wilcox.test(hmDat2.1$value[hmDat2.1$RP.BarrettsAdjacent == 'no'],
# hmDat2.1$value[hmDat2.1$RP.BarrettsAdjacent == 'ND' & hmDat2.1$Project.ID ==
# 'ICGC-ESAD']) wilcox.test(hmDat2.1$value[hmDat2.1$RP.BarrettsAdjacent ==
# 'yes'], hmDat2.1$value[hmDat2.1$RP.BarrettsAdjacent == 'ND'&
# hmDat2.1$Project.ID == 'ICGC-ESAD'])

kable(table(hmDat2$RP.BarrettsAdjacent[hmDat2$Project.ID == "ICGC-ESAD"]))
Var1 Freq
yes 107
no 109
ND 0
# plot all data with cancers split according to Siewer Classification
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.SiewertClassification)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Siewert type") + geom_point(position = position_jitterdodge(jitter.width = 0.4, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.SiewertClassification), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

# plot all data with cancers split according to Tumour Differentiation Status
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.TumourGradingDifferentiationStatus)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Differentiation") + geom_point(position = position_jitterdodge(jitter.width = 0.05, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.TumourGradingDifferentiationStatus), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

# plot all data with cancers split according to Tumour stage
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.TStage.PrimaryTumour)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Stage") + geom_point(position = position_jitterdodge(jitter.width = 0.05, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.TStage.PrimaryTumour), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    6), drop = FALSE], clustering_method = "ward.D2", main = "BE Undiff markers CAM BE, GC, D2 and EAC - z-score", 
    show_rownames = FALSE, cluster_rows = T, scale = "row", breaks = breaksListZ, 
    color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(length(breaksListZ)), 
    show_colnames = FALSE, annotation_colors = anno_col, file = paste0(files.out, 
        "/Figure_5/Figure_5D_Undiff_heatmap.pdf"), width = 15)

Differentiated in GC only genes

iiii <- c("NG_diff_specific")
genes.list <- vector()
for (i in iiii) {
    genes.list <- unique(c(genes.list, markers[[i]]$ID))
}

# plot z-score data
pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    3, 4, 6), drop = FALSE], clustering_method = "ward.D2", main = "BE Undifferentiated markers CAM BE, GC, D2 and EAC - z-score", 
    show_rownames = FALSE, cluster_rows = T, scale = "row", breaks = breaksListZ, 
    color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(length(breaksListZ)), 
    show_colnames = FALSE, annotation_colors = anno_col)

# plot raw expression data
pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    6), drop = FALSE], clustering_method = "ward.D2", main = "BE Undifferentiated markers CAM BE, GC, D2 and EAC", 
    show_rownames = FALSE, cluster_rows = TRUE, show_colnames = FALSE, annotation_colors = anno_col)

# , scale='row', breaks = breaksListZ,color = colorRampPalette(rev(brewer.pal(n =
# 7, name = 'RdBu')))(length(breaksListZ)))


########################################### Scaled to GC samples only########

# get the data for all important genes
hmDat2 <- hmDat[rownames(hmDat) %in% genes.list, ]

# scale the data around the mean expression in NG samples without any scaling
# factor. I use scale fuction to do it with center and scale arguments. Then
# calculate the median expression of scaled expression in each of the samples.

hmDat2 <- apply(apply(hmDat2, 1, function(x) scale(x, center = mean(x[rownames(rowID)[rowID$primary_diagnosis == 
    "Gastric"]]), scale = FALSE)), 1, median)

# fix the names
names(hmDat2) <- colnames(hmDat[rownames(hmDat) %in% genes.list, ])

# merge the median expression per sample with other clinical information
hmDat2 <- merge(rowID, hmDat2, by = 0, all.y = TRUE)

# remove cancer samples without Barrett's status
hmDat2 <- hmDat2[!(hmDat2$RP.BarrettsAdjacent == "ND" & hmDat2$Project.ID == "ICGC-ESAD"), 
    ]
hmDat2$is.normal <- ifelse(hmDat2$Project.ID %in% c("Squamous", "Gastric", "Duodenum"), 
    "Normal", "NotNormal")
hmDat2$final <- paste0(hmDat2$Project.ID, hmDat2$is.normal, hmDat2$RP.BarrettsAdjacent)
hmDat2$colour <- paste0(hmDat2$is.normal, hmDat2$RP.BarrettsAdjacent)

# order the samples for the boxplots
hmDat2$Project.ID <- factor(hmDat2$Project.ID, levels = c("Squamous", "Gastric", 
    "Duodenum", "Barrett's", "ICGC-ESAD"))
hmDat2$RP.BarrettsAdjacent <- factor(hmDat2$RP.BarrettsAdjacent, levels = c("yes", 
    "no", "ND"))
hmDat2$RP.TumourGradingDifferentiationStatus <- factor(hmDat2$RP.TumourGradingDifferentiationStatus, 
    levels = c("poor", "moderate_to_poor", "moderate", "moderate_to_well", "well", 
        "unknown", NA))
hmDat2$final <- factor(hmDat2$final, levels = c("SquamousNormalND", "GastricNormalND", 
    "DuodenumNormalND", "Barrett'sNotNormalND", "ICGC-ESADNotNormalyes", "ICGC-ESADNotNormalno", 
    "ICGC-ESADNotNormalND"))


# plot all data without subgroups
p_all <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle("All") + geom_point(position = position_jitterdodge(jitter.width = 0.5, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + 
    theme_bw() + ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(2, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))
p_all

# remove duodenum from the figure
hmDat3 <- hmDat2[hmDat2$primary_diagnosis != "Duodenum", ]
# plot all data without subgroups
p_all <- ggplot(hmDat3, aes(x = Project.ID, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle("All without Duodenum") + 
    geom_point(position = position_jitterdodge(jitter.width = 0.5, dodge.width = 1, 
        seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + theme_bw() + 
    ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(2, 
    4)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))
p_all

# plot all data with cancers split according to Barrett's status
p <- ggplot(hmDat2, aes(x = final, y = y, fill = colour)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.8) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle("Barrett's next to") + 
    geom_point(position = position_jitterdodge(jitter.width = 0.4, dodge.width = 1, 
        seed = 50014), pch = 21, aes(fill = colour), size = 2.5) + theme_bw() + ylab("Normalized expression") + 
    xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p <- p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(5, 
    6), c(2, 5), c(2, 6)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 
    0.05, 1), symbols = c("****", "***", "**", "*", "ns")))
p

ggsave(paste0(files.out, "./Supplementary_figures/S18/Figure_S18A_NG_Diff.pdf"), 
    p_all, width = 12, height = 7, useDingbats = FALSE)

p_all <- ggplot(hmDat3, aes(x = final, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle(paste0("All without Duodenum ", 
    length(genes.list))) + geom_point(position = position_jitterdodge(jitter.width = 0.5, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + 
    theme_bw() + ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5), c(2, 4), c(2, 5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 
    0.05, 1), symbols = c("****", "***", "**", "*", "ns")))
p_all

p_all <- p_all + theme(legend.position = "none")
ggsave(paste0(files.out, "./Supplementary_figures/S18/Figure_S18A_NG_Diff_BE.pdf"), 
    p_all, width = 12, height = 7, useDingbats = FALSE)

kable(table(hmDat2$RP.BarrettsAdjacent[hmDat2$Project.ID == "ICGC-ESAD"]))
Var1 Freq
yes 107
no 109
ND 0
# plot all data with cancers split according to Siewer Classification
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.SiewertClassification)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Siewert type") + geom_point(position = position_jitterdodge(jitter.width = 0.4, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.SiewertClassification), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(2, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

# plot all data with cancers split according to Tumour Differentiation Status
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.TumourGradingDifferentiationStatus)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Differentiation") + geom_point(position = position_jitterdodge(jitter.width = 0.05, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.TumourGradingDifferentiationStatus), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(2, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

# plot all data with cancers split according to Tumour stage
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.TStage.PrimaryTumour)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Stage") + geom_point(position = position_jitterdodge(jitter.width = 0.05, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.TStage.PrimaryTumour), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(2, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    6), drop = FALSE], clustering_method = "ward.D2", main = "NG Diff markers CAM BE, GC, D2 and EAC - z-score", 
    show_rownames = FALSE, cluster_rows = T, scale = "row", breaks = breaksListZ, 
    color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(length(breaksListZ)), 
    show_colnames = FALSE, annotation_colors = anno_col, file = paste0(files.out, 
        "/Supplementary_figures/S18/Figure_S18A_NG_Diff_heatmap.pdf"), width = 15)

Undifferentiated in GC only genes

iiii <- c("NG_undiff_specific")
genes.list <- vector()
for (i in iiii) {
    genes.list <- unique(c(genes.list, markers[[i]]$ID))
}


# plot z-score data
pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    3, 4, 6), drop = FALSE], clustering_method = "ward.D2", main = "BE Undifferentiated markers CAM BE, GC, D2 and EAC - z-score", 
    show_rownames = FALSE, cluster_rows = T, scale = "row", breaks = breaksListZ, 
    color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(length(breaksListZ)), 
    show_colnames = FALSE, annotation_colors = anno_col)

# plot raw expression data
pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    6), drop = FALSE], clustering_method = "ward.D2", main = "BE Undifferentiated markers CAM BE, GC, D2 and EAC", 
    show_rownames = FALSE, cluster_rows = TRUE, show_colnames = FALSE, annotation_colors = anno_col)

# , scale='row', breaks = breaksListZ,color = colorRampPalette(rev(brewer.pal(n =
# 7, name = 'RdBu')))(length(breaksListZ)))


########################################### Scaled to GC samples only########

# get the data for all important genes
hmDat2 <- hmDat[rownames(hmDat) %in% genes.list, ]

# scale the data around the mean expression in NG samples without any scaling
# factor. I use scale fuction to do it with center and scale arguments. Then
# calculate the median expression of scaled expression in each of the samples.

hmDat2 <- apply(apply(hmDat2, 1, function(x) scale(x, center = mean(x[rownames(rowID)[rowID$primary_diagnosis == 
    "Gastric"]]), scale = FALSE)), 1, median)

# fix the names
names(hmDat2) <- colnames(hmDat[rownames(hmDat) %in% genes.list, ])

# merge the median expression per sample with other clinical information
hmDat2 <- merge(rowID, hmDat2, by = 0, all.y = TRUE)

# remove cancer samples without Barrett's status
hmDat2 <- hmDat2[!(hmDat2$RP.BarrettsAdjacent == "ND" & hmDat2$Project.ID == "ICGC-ESAD"), 
    ]
hmDat2$is.normal <- ifelse(hmDat2$Project.ID %in% c("Squamous", "Gastric", "Duodenum"), 
    "Normal", "NotNormal")
hmDat2$final <- paste0(hmDat2$Project.ID, hmDat2$is.normal, hmDat2$RP.BarrettsAdjacent)
hmDat2$colour <- paste0(hmDat2$is.normal, hmDat2$RP.BarrettsAdjacent)

# order the samples for the boxplots
hmDat2$Project.ID <- factor(hmDat2$Project.ID, levels = c("Squamous", "Gastric", 
    "Duodenum", "Barrett's", "ICGC-ESAD"))
hmDat2$RP.BarrettsAdjacent <- factor(hmDat2$RP.BarrettsAdjacent, levels = c("yes", 
    "no", "ND"))
hmDat2$RP.TumourGradingDifferentiationStatus <- factor(hmDat2$RP.TumourGradingDifferentiationStatus, 
    levels = c("poor", "moderate_to_poor", "moderate", "moderate_to_well", "well", 
        "unknown", NA))
hmDat2$final <- factor(hmDat2$final, levels = c("SquamousNormalND", "GastricNormalND", 
    "DuodenumNormalND", "Barrett'sNotNormalND", "ICGC-ESADNotNormalyes", "ICGC-ESADNotNormalno", 
    "ICGC-ESADNotNormalND"))


# plot all data without subgroups
p_all <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle("All") + geom_point(position = position_jitterdodge(jitter.width = 0.5, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + 
    theme_bw() + ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(2, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))
p_all

# remove duodenum from the figure
hmDat3 <- hmDat2[hmDat2$primary_diagnosis != "Duodenum", ]
# plot all data without subgroups
p_all <- ggplot(hmDat3, aes(x = Project.ID, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle("All without Duodenum") + 
    geom_point(position = position_jitterdodge(jitter.width = 0.5, dodge.width = 1, 
        seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + theme_bw() + 
    ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(2, 
    4)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))
p_all

# plot all data with cancers split according to Barrett's status
p <- ggplot(hmDat2, aes(x = final, y = y, fill = colour)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.8) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle("Barrett's next to") + 
    geom_point(position = position_jitterdodge(jitter.width = 0.4, dodge.width = 1, 
        seed = 50014), pch = 21, aes(fill = colour), size = 2.5) + theme_bw() + ylab("Normalized expression") + 
    xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p <- p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(5, 
    6), c(2, 5), c(2, 6)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 
    0.05, 1), symbols = c("****", "***", "**", "*", "ns")))
p

ggsave(paste0(files.out, "./Supplementary_figures/S18/Figure_S18B_NG_Undiff.pdf"), 
    p_all, width = 12, height = 7, useDingbats = FALSE)

p_all <- ggplot(hmDat3, aes(x = final, y = y, fill = Project.ID)) + geom_boxplot(position = position_dodge2(1, 
    preserve = "single"), outlier.alpha = 0, width = 0.75) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1), plot.title = element_text(hjust = 0.5)) + ggtitle(paste0("All without Duodenum ", 
    length(genes.list))) + geom_point(position = position_jitterdodge(jitter.width = 0.5, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = Project.ID), size = 2.5) + 
    theme_bw() + ylab("Normalized expression") + xlab("Sample Type") + scale_fill_manual(values = anno_col$Project.ID)  #c('#BBDCAFFF', '#BBDCAFFF', '#BBDCAFFF', '#00A087FF', '#E0B400FF'))
p_all <- p_all + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(4, 
    5), c(2, 4), c(2, 5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 
    0.05, 1), symbols = c("****", "***", "**", "*", "ns")))
p_all

p_all <- p_all + theme(legend.position = "none")
ggsave(paste0(files.out, "./Supplementary_figures/S18/Figure_S18A_NG_Undiff_BE.pdf"), 
    p_all, width = 12, height = 7, useDingbats = FALSE)

# wilcox.test(hmDat2$value[hmDat2$RP.BarrettsAdjacent == 'no'],
# hmDat2$value[hmDat2$RP.BarrettsAdjacent == 'yes'])


# wilcox.test(hmDat2.1$value[hmDat2.1$RP.BarrettsAdjacent == 'no'],
# hmDat2.1$value[hmDat2.1$RP.BarrettsAdjacent == 'ND' & hmDat2.1$Project.ID ==
# 'ICGC-ESAD']) wilcox.test(hmDat2.1$value[hmDat2.1$RP.BarrettsAdjacent ==
# 'yes'], hmDat2.1$value[hmDat2.1$RP.BarrettsAdjacent == 'ND'&
# hmDat2.1$Project.ID == 'ICGC-ESAD'])

kable(table(hmDat2$RP.BarrettsAdjacent[hmDat2$Project.ID == "ICGC-ESAD"]))
Var1 Freq
yes 107
no 109
ND 0
# plot all data with cancers split according to Siewer Classification
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.SiewertClassification)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Siewert type") + geom_point(position = position_jitterdodge(jitter.width = 0.4, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.SiewertClassification), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(2, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

# plot all data with cancers split according to Tumour Differentiation Status
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.TumourGradingDifferentiationStatus)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Differentiation") + geom_point(position = position_jitterdodge(jitter.width = 0.05, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.TumourGradingDifferentiationStatus), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(2, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

# plot all data with cancers split according to Tumour stage
p <- ggplot(hmDat2, aes(x = Project.ID, y = y, fill = RP.TStage.PrimaryTumour)) + 
    geom_boxplot(position = position_dodge2(1, preserve = "single"), outlier.alpha = 0, 
        width = 1) + theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) + 
    ggtitle("Stage") + geom_point(position = position_jitterdodge(jitter.width = 0.05, 
    dodge.width = 1, seed = 50014), pch = 21, aes(fill = RP.TStage.PrimaryTumour), 
    size = 2.5) + theme_bw() + ylab("Normalized expression") + xlab("Sample Type")
# p+stat_compare_means(method = 'wilcox.test', paired = FALSE, comparisons =
# rev(list(c('Barrett's', 'ICGC-ESAD'),c('Barrett's', 'Gastric'),c('Barrett's',
# 'Duodenum'), c('no', 'yes'))) )
p + stat_compare_means(method = "wilcox.test", paired = FALSE, comparisons = list(c(2, 
    5)), symnum.args = list(cutpoints = c(0, 1e-04, 0.001, 0.01, 0.05, 1), symbols = c("****", 
    "***", "**", "*", "ns")))

pheatmap(hmDat[rownames(hmDat) %in% genes.list, ], annotation_col = rowID[, c(1, 
    6), drop = FALSE], clustering_method = "ward.D2", main = "NG Undiff markers CAM BE, GC, D2 and EAC - z-score", 
    show_rownames = FALSE, cluster_rows = T, scale = "row", breaks = breaksListZ, 
    color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(length(breaksListZ)), 
    show_colnames = FALSE, annotation_colors = anno_col, file = paste0(files.out, 
        "/Supplementary_figures/S18/Figure_S18B_NG_Undiff_heatmap.pdf"), width = 15)

End Matters

To finish get session info: R version 3.6.2 (2019-12-12) Platform: x86_64-redhat-linux-gnu (64-bit) Running under: Fedora 31 (Workstation Edition)

Matrix products: default BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so

locale: [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
[5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
[7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C

attached base packages: [1] parallel stats4 stats graphics grDevices utils datasets [8] methods base

other attached packages: [1] destiny_3.0.1 dbscan_1.1-5
[3] princurve_2.1.4 dynamicTreeCut_1.63-1
[5] scran_1.14.6 scater_1.14.6
[7] SingleCellExperiment_1.8.0 SummarizedExperiment_1.16.1 [9] DelayedArray_0.12.2 BiocParallel_1.20.1
[11] matrixStats_0.55.0 Biobase_2.46.0
[13] GenomicRanges_1.38.0 GenomeInfoDb_1.22.0
[15] IRanges_2.20.2 S4Vectors_0.24.3
[17] BiocGenerics_0.32.0 gridExtra_2.3
[19] knitr_1.28 ggpubr_0.2.5
[21] magrittr_1.5 reshape2_1.4.3
[23] RColorBrewer_1.1-2 openxlsx_4.1.4
[25] pheatmap_1.0.12 ggplot2_3.2.1
[27] edgeR_3.28.0 limma_3.42.2
[29] rmarkdown_2.1

loaded via a namespace (and not attached): [1] ggbeeswarm_0.6.0 colorspace_1.4-1 RcppEigen_0.3.3.7.0
[4] ggsignif_0.6.0 class_7.3-15 rio_0.5.16
[7] XVector_0.26.0 RcppHNSW_0.2.0 BiocNeighbors_1.4.1
[10] proxy_0.4-23 hexbin_1.28.1 farver_2.0.3
[13] RSpectra_0.16-0 ranger_0.12.1 codetools_0.2-16
[16] splines_3.6.2 robustbase_0.93-5 compiler_3.6.2
[19] ggplot.multistats_1.0.0 dqrng_0.2.1 assertthat_0.2.1
[22] Matrix_1.2-18 lazyeval_0.2.2 BiocSingular_1.2.1
[25] formatR_1.7 htmltools_0.4.0 tools_3.6.2
[28] rsvd_1.0.2 igraph_1.2.4.2 gtable_0.3.0
[31] glue_1.3.1 GenomeInfoDbData_1.2.2 dplyr_0.8.4
[34] ggthemes_4.2.0 Rcpp_1.0.3 carData_3.0-3
[37] cellranger_1.1.0 vctrs_0.2.2 DelayedMatrixStats_1.8.0 [40] lmtest_0.9-37 xfun_0.12 laeken_0.5.1
[43] stringr_1.4.0 lifecycle_0.1.0 irlba_2.3.3
[46] statmod_1.4.33 DEoptimR_1.0-8 zoo_1.8-7
[49] zlibbioc_1.32.0 MASS_7.3-51.4 scales_1.1.0
[52] VIM_5.1.0 pcaMethods_1.78.0 hms_0.5.3
[55] yaml_2.2.1 curl_4.3 stringi_1.4.5
[58] highr_0.8 e1071_1.7-3 TTR_0.23-6
[61] boot_1.3-23 zip_2.0.4 rlang_0.4.4
[64] pkgconfig_2.0.3 bitops_1.0-6 evaluate_0.14
[67] lattice_0.20-38 purrr_0.3.3 labeling_0.3
[70] tidyselect_1.0.0 plyr_1.8.5 R6_2.4.1
[73] pillar_1.4.3 haven_2.2.0 foreign_0.8-72
[76] withr_2.1.2 xts_0.12-0 scatterplot3d_0.3-41
[79] abind_1.4-5 RCurl_1.98-1.1 sp_1.3-2
[82] nnet_7.3-12 tibble_2.1.3 crayon_1.3.4
[85] car_3.0-6 viridis_0.5.1 locfit_1.5-9.1
[88] grid_3.6.2 readxl_1.3.1 data.table_1.12.8
[91] forcats_0.4.0 vcd_1.4-5 digest_0.6.24
[94] tidyr_1.0.2 munsell_0.5.0 beeswarm_0.2.3
[97] viridisLite_0.3.0 smoother_1.1 vipor_0.4.5